{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import os\n",
        "\n",
        "from functools import partial\n",
        "from operator import itemgetter\n",
        "from torchdata.datapipes.iter import (\n",
        "    FileOpener,\n",
        "    HttpReader,\n",
        "    IterableWrapper,\n",
        "    SampleMultiplexer,\n",
        ")\n",
        "\n",
        "ROOT_DIR = os.path.expanduser('~/.torchdata/CC100')  # This directory needs to be crated and set\n",
        "\n",
        "\n",
        "def _path_fn(root, x):\n",
        "    return os.path.join(root, os.path.basename(x).rstrip(\".xz\"))\n",
        "\n",
        "def _process_tuple(language_code, t):\n",
        "    return language_code, t[1].decode()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [],
      "source": [
        "# CC100 support (http://data.statmt.org/cc-100/)\n",
        "\n",
        "URL=\"http://data.statmt.org/cc-100/%s.txt.xz\"\n",
        "VALID_CODES = [\n",
        "    \"am\", \"ar\", \"as\", \"az\", \"be\", \"bg\", \"bn\", \"bn_rom\", \"br\", \"bs\", \"ca\", \"cs\", \"cy\", \"da\", \"de\",\n",
        "    \"el\", \"en\", \"eo\", \"es\", \"et\", \"eu\", \"fa\", \"ff\", \"fi\", \"fr\", \"fy\", \"ga\", \"gd\", \"gl\", \"gn\", \"gu\",\n",
        "    \"ha\", \"he\", \"hi\", \"hi_rom\", \"hr\", \"ht\", \"hu\", \"hy\", \"id\", \"ig\", \"is\", \"it\", \"ja\", \"jv\", \"ka\",\n",
        "    \"kk\", \"km\", \"kn\", \"ko\", \"ku\", \"ky\", \"la\", \"lg\", \"li\", \"ln\", \"lo\", \"lt\", \"lv\", \"mg\", \"mk\", \"ml\",\n",
        "    \"mn\", \"mr\", \"ms\", \"my\", \"my_zaw\", \"ne\", \"nl\", \"no\", \"ns\", \"om\", \"or\", \"pa\", \"pl\", \"ps\", \"pt\",\n",
        "    \"qu\", \"rm\", \"ro\", \"ru\", \"sa\", \"si\", \"sc\", \"sd\", \"sk\", \"sl\", \"so\", \"sq\", \"sr\", \"ss\", \"su\", \"sv\",\n",
        "    \"sw\", \"ta\", \"ta_rom\", \"te\", \"te_rom\", \"th\", \"tl\", \"tn\", \"tr\", \"ug\", \"uk\", \"ur\", \"ur_rom\", \"uz\",\n",
        "    \"vi\", \"wo\", \"xh\", \"yi\", \"yo\", \"zh-Hans\", \"zh-Hant\", \"zu\",\n",
        "]\n",
        "\n",
        "def CC100(root, language_code, use_caching=True):\n",
        "    if language_code not in VALID_CODES:\n",
        "        raise ValueError(f\"Invalid language code {language_code}\")\n",
        "    url = URL % language_code\n",
        "    if use_caching:\n",
        "        cache_compressed_dp = HttpReader(cache_compressed_dp).map(itemgetter(0))\n",
        "        cache_compressed_dp = cache_compressed_dp.end_caching(mode=\"wb\", same_filepath_fn=True)\n",
        "        cache_decompressed_dp = cache_compressed_dp.on_disk_cache(filepath_fn=partial(_path_fn, root))\n",
        "        cache_decompressed_dp = FileOpener(cache_decompressed_dp).read_from_xz()\n",
        "        cache_decompressed_dp = cache_decompressed_dp.end_caching(mode=\"wb\")\n",
        "        data_dp = FileOpener(cache_decompressed_dp)\n",
        "    else:\n",
        "        data_dp = HttpReader([url]).read_from_xz()\n",
        "    units_dp = data_dp.readlines().map(partial(_process_tuple, language_code))\n",
        "    return units_dp\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "('en', 'Belmont Estate is on the market for $63 million and boasts roughly 22,000 square feet of luxurious finishes and elaborate architecture on 1.28 acres. Listed on Thursday, the home is being sold by high-end real estate firm Sotheby’s International Realty Canada.')\n",
            "('en', '“Within the city we’ve had homes that have sold for $56 million, $33 million, $31 million but this will be the record of the offering price,” listing agent Christa Frosch of Sotheby’s tells BuzzBuzzNews.')\n",
            "('en', 'The three-storey home has five bedrooms, twelve bathrooms and an elevator in the west wing. Built to entertain, two main gallery halls can seat up to 100 guests. The Italian-inspired kitchen includes a fireplace and walls and ceilings throughout the home feature murals and artwork. Lavish amenities include an indoor pool and sauna, a six-car garage and a private entrance in-law’s suite.')\n",
            "('en', 'Surrounding the property is a Versailles-inspired garden with a variety of trees, plants and an orchard. In the spring, over 12,000 flowers bloom in the tiered, three-level garden.')\n",
            "('en', 'According to Frosch, the listing has received global attention and, despite being on the market for only 24 hours, buyers are already showing interest.')\n",
            "('en', '“We just went to the market yesterday, it’s private through Sotheby’s and we’ve already started to get calls,” says Frosch.')\n",
            "('en', '')\n",
            "Execution time 0.55 secs\n"
          ]
        }
      ],
      "source": [
        "# Sample from multi-gigabyte-size compressed dataset without downloading the whole thing\n",
        "# This executes very fast\n",
        "import time\n",
        "start_time = time.time()\n",
        "for i, x in enumerate(CC100(ROOT_DIR, 'en', use_caching=False)):\n",
        "    print(x)\n",
        "    if i > 5:\n",
        "        break\n",
        "print(f\"Execution time {(time.time() - start_time):.2f} secs\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "('ha',\n",
              " 'Dangantaka tsakanin kasashen Masar da Turkiya ta yi tsami a cikin yan kwanakin nan, saboda sanin iyakokin da kowanne daga cikin yake mallaka a tekun Mediterranean .')"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# cache\n",
        "# This cell is very slow to run the first time as it downloads a dataset from a very slow server\n",
        "next(iter(CC100(ROOT_DIR, 'ha', use_caching=True)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "('yi', 'קאַטעגאָריע:cs-- – װיקיװערטערבוך')"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# cache\n",
        "# This cell is very slow to run the first time as it downloads a dataset from a very slow server\n",
        "next(iter(CC100(ROOT_DIR, 'yi', use_caching=True)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[('li', \"Kop van 't Ende - Wikipedia\"), ('li', ''), ('li', \"Coos is 'n in 1853 gestiech graofsjap in Oregon, VS. Coos is verneump nao de Cook-koo-oose, 'n inheims Amerikaans stam, die allewijl neet mie besteit. De hoofplaots vaan 't graofsjap is Coquille.\"), ('ha', 'Dangantaka tsakanin kasashen Masar da Turkiya ta yi tsami a cikin yan kwanakin nan, saboda sanin iyakokin da kowanne daga cikin yake mallaka a tekun Mediterranean .'), ('yi', 'קאַטעגאָריע:cs-- – װיקיװערטערבוך'), ('li', \"'t Graofsjap heet 'n totaal oppervlak vaan 4.678 km² boevaan 4.145 km² land is en 533 km² water.\"), ('ha', \"Kamfanin dillancin labaran IRNA na kasar Iran ya nakalto Ahmad Abu-Zaid kakakin ma'aikatar harkokin wajen kasar Masar yarjejeniyar da kasar Masar ta cimma da kasar Cyprus kan iyakokin da kowanne daga cikinsu yake mallaka daga gabacin tekun Mediterranean ta zama doka ce, kuma duk wanda yayi kokarin taka ta Masar zata kalubalance shi.\"), ('ha', 'Abu-Zaid ya kara da cewa yarjejeniyar rabon kan iyaka a cikin tekun Mediterranean , yarjejjeniya ce ta kasa da kasa wacce Majalisar dinkin duniya ta amince da ita.'), ('li', \"Volgens de census vaan 2000 bedroog 't totaol bevolkingsaontal in Coos County 62.779.\"), ('ha', 'Amma ministan harkokin wajen kasar Turkiya Maulud Chavis-Uglu, a ranar litinin da ta gabata ce ya bada sanarwan cewa kasar Turkiya ba ta amince da yarjejeniyar da kasashen Masar ta Cyprus suka cimma kan rabon kan iyaka da kuma amfani da tekun Mediterranean a shekara ta 2013 ba.'), ('li', \"De twie belaankriekste plaotse vaan 't graofsjap zien:\"), ('ha', 'Wani Sabon Sabani Ya Kunno kai Tsakanin Kasashen Masar Da Turkiyya'), ('li', \"Gesjreve in 't Mestreechs\"), ('li', \"Dees pazjena is 't lèts verangerd op 9 mrt 2013, 04:24.\"), ('ha', 'Masar Ta Zargi Mahukuntan Turkiyya Da Kokarin Yin Zagon Kasa Ga Harkar Tattalin Arzikin Kasarta'), ('li', ''), ('li', \"'ne Centimeter (aofkorting: cm) is geliek aon 'ne hoonderdste meter, ofwel 0,01 meter. Dit is weer geliek aon 10 millimeter. 't Voorveugsel centi is aofkomsteg vaan 't Latiense centum, wat hoonderd beteikent. In 't dageleks leve weurt de maot dèks gebruuk: me gebruuk 't veur 't mete vaan liechaamslengde, meubelaofmetinge, kleiding, enz. In technische teikeninge gebruuk me liever de millimeter.\"), ('ha', ''), ('li', \"'n Meetlint weurt dèks ouch 'ne centimeter geneump.\"), ('li', \"Gesjreve in 't Mestreechs\")]\n",
            "Expected ratio: 0.7, actual 0.699058\n"
          ]
        }
      ],
      "source": [
        "import itertools\n",
        "# Cache two of the datasets. The backend rate-limits connections to 1 per ip,\n",
        "# so you can't have more than one dataset running without caching\n",
        "\n",
        "# If you do \"run all\" this may fail because the previous http connections might still be alive\n",
        "\n",
        "z1 = CC100(ROOT_DIR, 'li', use_caching=False).cycle()\n",
        "z2 = CC100(ROOT_DIR, 'ha', use_caching=True).cycle()\n",
        "z3 = CC100(ROOT_DIR, 'yi', use_caching=True).cycle()\n",
        "\n",
        "z = SampleMultiplexer({z1: 0.7, z2: 0.2, z3: 0.1})\n",
        "\n",
        "l = list(itertools.islice(z, 0, 500000))\n",
        "print(l[0:20])\n",
        "\n",
        "ratio = sum(1 for k,v in l if k == 'li') / len(l)\n",
        "print(f\"Expected ratio: 0.7, actual {ratio}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "pycharm": {
          "name": "#%%\n"
        }
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "('ha',\n",
              " \"Dangantaka tsakanin kasashen Masar da Turkiya ta yi tsami a cikin yan kwanakin nan, saboda sanin iyakokin da kowanne daga cikin yake mallaka a tekun Mediterranean .\\nKamfanin dillancin labaran IRNA na kasar Iran ya nakalto Ahmad Abu-Zaid kakakin ma'aikatar harkokin wajen kasar Masar yarjejeniyar da kasar Masar ta cimma da kasar Cyprus kan iyakokin da kowanne daga cikinsu yake mallaka daga gabacin tekun Mediterranean ta zama doka ce, kuma duk wanda yayi kokarin taka ta Masar zata kalubalance shi.\\nAbu-Zaid ya kara da cewa yarjejeniyar rabon kan iyaka a cikin tekun Mediterranean , yarjejjeniya ce ta kasa da kasa wacce Majalisar dinkin duniya ta amince da ita.\\nAmma ministan harkokin wajen kasar Turkiya Maulud Chavis-Uglu, a ranar litinin da ta gabata ce ya bada sanarwan cewa kasar Turkiya ba ta amince da yarjejeniyar da kasashen Masar ta Cyprus suka cimma kan rabon kan iyaka da kuma amfani da tekun Mediterranean a shekara ta 2013 ba.\\nWani Sabon Sabani Ya Kunno kai Tsakanin Kasashen Masar Da Turkiyya\\nMasar Ta Zargi Mahukuntan Turkiyya Da Kokarin Yin Zagon Kasa Ga Harkar Tattalin Arzikin Kasarta\")"
            ]
          },
          "execution_count": 8,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "next(iter(CC100(ROOT_DIR, 'ha', use_caching=False).lines_to_paragraphs()))"
      ]
    }
  ],
  "metadata": {
    "fileHeader": "",
    "fileUid": "6a787fbf-7d19-47c8-ae45-d670856ce03b",
    "isAdHoc": false,
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "bento_kernel_default"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.5"
    }
  }
}
